
********************************************************************************
*** 							DATA CLEANING				   				 ***
********************************************************************************

** Define paths **
global data "[insert_path]/Data"												// insert path to data files
global results "[insert_path]/Results"											// insert path to results folder			
cd "$results" 																	

** MERGE DATA ** 

use "$data/HPACC_LMIC_Pt1_2024-11-26.dta", clear 						// n=226,086; 25 surveys

append using "$data/HPACC_LMICs_Pt2_2024-11-26.dta" 					// n=1,322,697; 67 surveys

append using "$data/HPACC_main_HIC_2024-11-26.dta"						// n=108,063; 19 surveys

append using "/$data/HPACC_DUA_2024-11-26.dta"	// n=83,528; 8 surveys 		

count 							// should be N=1,740,374


** CLEANING DATASET ** 

* Deleting variables not needed for this paper 

keep country Country rural svy dia_medincl ex_hbg ex_fbg p_id hh_id year psu Population2015 sex age educat educat3 educat_lcl edyears fbg fbg_new hba1c_m fast_new bg_ms_new asset_index controlled_dia ///
pregnant countryGDPclass clin_dia wealth_quintile stratum_num psu_num hba1c_p dia_med dia_med_new insulin insulin_new hbg_new dia_adv w1 w2 w3 wstep1 wstep2 wstep3 u24e u24f adv_bwd adv_pad adv_fvd ///
bmi mi sbp1 sbp2 sbp3 sbp_avg_new sbp_nmeasures dbp1 dbp2 dbp3 dbp_avg_new dbp_nmeasures clin_hypt hypt_new hypt_med_new hypt_adv all_disl csmoke bmicat ex_age ///
chol_ms hchol hchol12 chol_med tchol_mmoll chol_med2 hdl_mmoll tchol_mgdl tchol_mgdl2 hdl_mgdl hdl_mgdl2 ldl_mgdl ldl_mmoll high_chol high_ldl low_hdl tchol_to_hdl ex_year

save "$data/glp1_preclean.dta", replace
use "$data/glp1_preclean.dta", clear

* Inclusion criteria for surveys: 
/*
- 2009 or later
- Data on BMI and measured BP
- Data on diabetes biomarkers
*/

* Deleting countries not fulfilling our inclusion criteria 		

drop if country == "Belize" ///					// 2008 or earlier				
| country == "Niger" 

*drop if country == ""							// All countries have BMI data
*drop if country == ""							// All countries have BP data			
							
drop if country == "Albania" ///				// No diabetes biomarkers	
| country == "Bahamas" ///
| country == "Bolivia" ///
| country == "Cayman Islands" ///	
| country == "Egypt" ///
| country == "French Polynesia" ///
| country == "Gambia" ///
| country == "Ghana" ///								// also, collected 2007/8
| country == "Grenada" ///	
| country == "India LASI" ///			
| country == "Paraguay" ///
| country == "Russian Federation" ///					// also, collected 2007/8
| country == "Sierra Leone" ///
| country == "Tonga" 
			
* Deleting countries for other reasons	

drop if country == "Greece"						// No permission for use

drop if country == "South Africa DHS" 			// We use SANHANES instead					
	
drop if country == "China CHARLS"				// We use CHNS instead

* Merge SANHANES A1c from DUA data 

drop if mi(country)					// Drops DUA ZA data

preserve 
keep if country == "South Africa"  
drop hba1c_p
merge 1:1 p_id using "$data/South Africa_David.dta", keepusing(hba1c_p) 	// merges HbA1c for 4,615 participants
drop _merge
save "$data/country_update_ZA", replace 
restore  

drop if country == "South Africa"  
append using "$data/country_update_ZA"

* Rename countries
replace country = "Cabo Verde" if country == "Cabo Verde 2020"
replace country = "Chile" if country == "Chile ENS"
replace country = "Malawi" if country == "Malawi 2017" 
replace country = "Mongolia" if country == "Mongolia 2019"
replace country = "Peru" if country == "Peru 2017"
replace country = "STP" if country == "Sao Tome and Principe 2019"
replace country = "Sri Lanka" if country == "Sri Lanka 2021"
replace country = "Vietnam" if country == "Vietnam 2021" 
replace country = "USA" if country == "United States of America" 

sort country 

* Labeling variables & categories

label variable sex "Sex"
	label define sex 0 "Male" 1 "Female" 
	label values sex sex
label variable educat3 "Education"
	label define educat3 0 "No formal schooling" 1 "Primary School" 2 "Secondary School or above" 
	label values educat3 educat3
label variable educat "Education (broader categories)"
	label define educat5 0 "No formal schooling" 1 "Less than primary school (<6)" 2 "Primary school completed (=6)" 3 "Some high school (7-11)" 4 "High school or above(≥12)"
	label values educat educat5
label variable wealth_quintile "Wealth index by quintile"
	label define wealth_quintile 1 "Q1" 2 "Q2" 3 "Q3" 4 "Q4" 5 "Q5"
	label values wealth_quintile wealth_quintile
label define GDP2 1 "Low-income country" 2 "Lower-middle-income country" 3 "Upper-middle-income country" 4 "High-income country"
	label values countryGDPclass GDP2

** FIXING COUNTRY-SPECIFIC ISSUES **		

* India: weights for  en			// Falsely dividied by 15 instead of 0.15
replace w3 = w3*(15/0.15) if country == "India" & sex == 0

* Argentina: Ex_fbg
replace fbg_new = ex_fbg if country == "Argentina"

* Marshall Islands: Diabetes diagnosis
replace hbg_new = 0 if ex_hbg == 999999999 & country == "Marshall Islands"

* England: Age
replace age = ex_age if country == "England"

* Peru: A1c false
replace hba1c_p = . if country == "Peru"
replace hba1c_m = . if country == "Peru"

* China: Medication
replace dia_med_new = . if country == "China"
replace dia_med_new = 1 if country == "China" & u24e == 1		
replace dia_med_new = 0 if country == "China" & u24e == 0

replace dia_med_new = 0 if inlist(0, bg_ms_new, hbg_new) & dia_med == 1 & country == "China" 								
replace dia_med_new = 0 if inlist(0, bg_ms_new, hbg_new) & inlist(dia_med, 999999999,1000000000) & country == "China" 					
replace dia_med_new=. if bg_ms_new==. & !missing(bg_ms) & country == "China"	                 
replace dia_med_new=. if hbg_new==. & !missing(hbg) & country == "China" 				               
replace dia_med_new=. if dia_med_new>800 & country == "China" 															   
tab dia_med_new if country == "China",m							

replace insulin_new = . if country == "China"
replace insulin_new = 1 if country == "China" & u24f == "1"		
replace insulin_new = 0 if country == "China" & u24f == "0"

replace insulin_new = 0 if inlist(0, bg_ms_new, hbg_new) & insulin == 1 & country == "China"    							
replace insulin_new = 0 if inlist(0, bg_ms_new, hbg_new) & inlist(insulin, 999999999,1000000000) & country == "China"
replace insulin_new=. if bg_ms_new==. & !missing(bg_ms) & country == "China"                              
replace insulin_new=. if hbg_new==. & !missing(hbg) & country == "China"  							               
replace insulin_new=. if insulin_new>800 & country == "China" 														
tab insulin_new if country == "China", m						

drop u24e u24f

* Pakistan: Fix fasting status 

/*
- Glucose was only collected if people fasted at least 8 hours
- In current data, only participants who fasted ≥ 12 hours considered fasted
- Instead, we want to use local definition (fasted if fasting for ≥8 hours)  
--> Recode the "fast" variable to be 1 if the participant has a fbg level available
*/

replace fast_new = 1 if !missing(fbg_new) & country == "Pakistan"

* Nigeria: Fix diagnosis status variables						

/*
- REMAH didn't send us the variable "BP measured"
- Since the diagnosed variable ("hypt" or "hypt_new") is conditional on that response, it will be missing if the person says no. 
- Same goes for diabetes
--> For any missing value for "hypt_new" (or hbg_new), assume hypt_new is 0, unless sbp or dbp are missing. Do the same for diabetes.
*/

replace hypt_new = 0 if mi(hypt_new) & !mi(sbp_avg) & !mi(dbp_avg) & country == "Nigeria"
replace hbg_new = 0 if mi(hbg_new) & !mi(fbg_new) & country == "Nigeria"

** CLEANING VARIABLES & GENERATING NEW ONES **

* Wealth quintiles
replace wealth_quintile = . if wealth_quintile > 5

* Clean bmi 
replace bmi = . if bmi >80					// Higher values are placeholders for missingness

* Clean cardiovascular events
replace mi = . if mi >1

* Cearn cholesterol meds
replace chol_med = . if chol_med >1			// past 12 weeks
replace chol_med2 = . if chol_med2 >1		// past 2 weeks
replace hchol = . if hchol >1				// awareness of diagnosis

* Clean smoking
replace csmoke = . if csmoke >1	

* Clean education 	
replace educat = . if educat == .c	
replace edyears = . if edyears >95

* Clean sex
replace sex = . if sex > 1					// 151 to missing

* Pregnancy status 
replace pregnant = . if pregnant > 2

* Clean diabetes biomarkers (replace placeholders = .)
replace hba1c_p = . if hba1c_p < 3 								// 0 changes
replace hba1c_p = . if hba1c_p > 17								// 17 = largest value; rest are placeholders
replace hba1c_m = . if hba1c_m < 9.3 							// 0 changes
replace hba1c_m = . if hba1c_m > 162.4							// 162.31 = largest value; rest are placeholde	
replace fbg_new = . if fbg_new < 2.2  							// 0 changes
replace fbg_new = . if fbg_new > 33.3							// 33.3 = largest value; rest are placeholde
replace dia_med_new = . if dia_med_new > 1	| dia_med_new < 0	// placeholder = .
replace insulin_new = . if insulin_new > 1	| insulin_new < 0	// placeholder = .

* Re-create diabetes variable *

* Taking hypogylcemic meds
gen dia_therapy = .
replace dia_therapy = 1 if insulin_new == 1 | dia_med_new == 1
replace dia_therapy = 0 if (insulin_new == 0 & dia_med_new != 1) | (insulin_new != 1 & dia_med_new == 0) 			
replace dia_therapy = . if insulin_new ==. & dia_med_new == .					// Treatment only defined as missing if missing data on both insulin and other meds use

tab dia_therapy if hbg_new == 0 							// no one who is undiagnosed takes meds
mdesc dia_therapy if hbg_new == 0							// 28,105 (2.09%) of undiagnosed individuals "miss" medication data (most of them in countries with diabetes biomarkers in sub-sample only, e.g. ZA)
*bys country: mdesc dia_therapy 							// Pakistan missing treatment variable completely
replace dia_therapy = 0 if dia_therapy == . & hbg_new == 0 & country!= "Pakistan" 	// Define undiagnosed individuals as not receiving treatment (STEPS surveys assumes that undiagnosed are untreated)			
replace dia_therapy = . if country == "Pakistan" 			// 0 changes

* Pathologic biomarkers			
gen path_bio = .
replace path_bio = 0 if (country != "India" & country != "Nigeria") & ((fbg_new < 7 & fbg_new !=. & fast_new != 0)) 										// Assuming fasting, unless stated otherwise
replace path_bio = 0 if (country != "India" & country != "Nigeria") & (fbg_new < 11.1 & fbg_new !=. & fast_new == 0)
replace path_bio = 0 if (country != "India" & country != "Nigeria") & ((hba1c_p < 6.5 & hba1c_p !=. & mi(fbg_new))) 										// HbA1c only used as biomarker if missing FBG	

replace path_bio = 1 if (country != "India" & country != "Nigeria") & ((fbg_new >= 7 & fbg_new !=. & fast_new != 0) ///
| (fbg_new >= 11.1 & fbg_new !=.) | (hba1c_p >= 6.5 & hba1c_p !=. & mi(fbg_new))) 	

replace path_bio = 0 if (country == "India" | country == "Nigeria") & ((fbg_new < 7 & fbg_new !=. & fast_new == 1)) 										// Assuming no fasting, unless stated otherwise							
replace path_bio = 0 if (country == "India" | country == "Nigeria") & (fbg_new < 11.1 & fbg_new !=. & fast_new !=1)											// Both countries have no A1c measure

replace path_bio = 1 if (country == "India" | country == "Nigeria") & ((fbg_new >= 7 & fbg_new !=. & fast_new == 1) | (fbg_new >= 11.1 & fbg_new !=.)) 	
replace path_bio = . if fbg_new ==. & hba1c_p ==.	

* Main diabetes status variable: Biomarker OR diagnosis	
gen clin_dia_fx = .
replace clin_dia_fx = 1 if path_bio == 1 | hbg_new == 1			
replace clin_dia_fx = 0 if path_bio == 0 & hbg_new == 0	
replace clin_dia_fx = . if mi(path_bio) | mi(hbg_new)			// Drop folks without data on diagnosis separately later on			

* Alternative diabetes status variable: Biomarker OR treatment						
gen clin_dia_trt = .
replace clin_dia_trt = 1 if path_bio == 1 | dia_therapy == 1			
replace clin_dia_trt = 0 if path_bio == 0 & dia_therapy == 0
replace clin_dia_trt = . if mi(path_bio) | mi(dia_therapy)		
replace clin_dia_trt = clin_dia_fx if country == "Pakistan"					// Pakistan lacks dia_therapy		

drop clin_dia

* Re-create hypertension variable *		

* Antihypertensives
tab hypt_med_new if hypt_new == 0 												// 1,119 taking meds but undiagnosed
replace hypt_new = 1 if hypt_med_new == 1 
	
mdesc hypt_med_new if hypt_new == 0												// 34,791 (2.82%) of undiagnosed individuals "miss" medication data 
*bys country: mdesc hypt_med_new 												// England & Pakistan missing treatment variable completely
replace hypt_med_new = 0 if hypt_med_new == . & hypt_new == 0 ///
& country!= "Pakistan" & country != "England" 									// Define undiagnosed individuals as not receiving treatment (STEPS surveys assumes that undiagnosed are untreated)			
replace hypt_med_new = . if country == "Pakistan" | country == "England"		// 0 changes

* Clean variables		
replace dbp_nmeasures = . if dbp_nmeasures >3
replace sbp_nmeasures = . if sbp_nmeasures >3

* Main hypertension status variable: Biomarker OR diagnosis	
gen clin_hypt_fx = .
replace clin_hypt_fx = 1 if (sbp_avg_new >= 140 & sbp_avg_new !=.) | (dbp_avg_new >= 90 & dbp_avg_new !=.) | hypt_new == 1		
replace clin_hypt_fx = 0 if sbp_avg_new < 140 & dbp_avg_new < 90 & hypt_new == 0
replace clin_hypt_fx = . if mi(sbp_avg_new) | mi(dbp_avg_new) | mi(hypt_new) 												

* Alternative hypertension status variable: Biomarker OR treatment						
gen clin_hypt_trt = .
replace clin_hypt_trt = 1 if (sbp_avg_new >= 140 & sbp_avg_new !=.) | (dbp_avg_new >= 90 & dbp_avg_new !=.) | hypt_med_new == 1		
replace clin_hypt_trt = 0 if sbp_avg_new < 140 & dbp_avg_new < 90 & hypt_med_new == 0
replace clin_hypt_trt = . if mi(sbp_avg_new) | mi(dbp_avg_new) | mi(hypt_med_new) 												
replace clin_hypt_trt = clin_hypt_fx if country == "England" | country == "Pakistan"

drop clin_hypt

* New World regions (NCD-RisC 2024 as starting point)

/*
Europe and north America 			// Merger of high-income western (US, England, Portugal, Spain), central Europe (Romania, Czechia), and eastern Europe (Belarus, Ukraine, Moldova)
Central Asia						
Middle East and north Africa
Latin America and the Caribbean
Pacific island nations
South, east and southeast Asia		// Merger of "South Asia" and "East and Southeast Asia"; addition "and the Pacific" (likely referring to Taiwan) omitted
Sub-Saharan Africa					// We could further split up by sub-region (East, West, South)
*/

generate worldregion7 = .	
replace worldregion7 = 1 if country == "Belarus" | country == "Czech Republic" | country == "England" | country == "Germany" ///
| country == "Malta" | country == "Moldova" | country == "Portugal" | country == "Romania" | country == "Spain" ///
| country == "Ukraine" | country == "USA"  

replace worldregion7 = 2 if country == "Azerbaijan" | country == "Georgia" | country == "Kyrgyzstan" ///
| country == "Mongolia" | country == "Tajikistan" | country == "Turkmenistan" | country == "Kazakhstan"

replace worldregion7 = 3 if country == "Algeria" | country == "Iran" | country == "Iraq" ///
| country == "Jordan" | country == "Kuwait" | country == "Lebanon" | country == "Libya" ///
| country == "Morocco" | country == "Palestine" | country == "Qatar"

replace worldregion7 = 4 if country == "Argentina" | country == "Barbados" | country == "Brazil" | country == "Bermuda" ///
| country == "Chile" | country == "Costa Rica" | country == "Ecuador" | country == "El Salvador" ///
| country == "Guyana" | country == "Mexico" | country == "Haiti" | country == "Panama" ///
| country == "Peru" | country == "Saint Lucia" | country == "Trinidad and Tobago" | country == "Uruguay" | country == "Venezuela"

replace worldregion7 = 5 if country == "Cook Islands" | country == "Fiji" | country == "Kiribati" ///
| country == "Niue" | country == "Samoa" | country == "Solomon Islands" | country == "Tuvalu" ///
| country == "Vanuatu" | country == "Marshall Islands" | country == "Tokelau" | country == "Palau" ///
| country == "Nauru" | country == "Tonga" | country == "Wallis and Futuna"

replace worldregion7 = 6 if country == "Afghanistan" | country == "Bangladesh" | country == "Bhutan" | country == "Brunei" | country == "Cambodia" ///
| country == "China" | country == "India" | country == "Indonesia" | country == "Laos" | country == "Myanmar" | country == "Nepal" ///
| country == "Pakistan" | country == "Singapore" | country == "South Korea" | country == "Sri Lanka" | country == "Timor Leste" | country == "Vietnam"

replace worldregion7 = 7 if country == "Benin" | country == "Botswana" | country == "Burkina Faso" | country == "Cabo Verde" | country == "Comoros" ///
| country == "Eritrea" | country == "Eswatini" | country == "Ethiopia" | country == "Kenya" | country == "Lesotho" ///
| country == "Liberia" | country == "Malawi" | country == "Mozambique" | country == "Namibia" |  country == "Nigeria" | country == "Rwanda" ///
| country == "Seychelles" | country == "South Africa" | country == "STP" | country == "Sudan" | country == "Tanzania" ///
| country == "Togo" | country == "Uganda" | country == "Zambia" | country == "Zanzibar" 

label define wr7 1 "Europe and north America" 2 "Central Asia" 3 "Middle East and North Africa" 4 "Latin America and the Caribbean" 5 "Pacific island nations" 6 "South, east and southeast Asia" 7 "Sub-Saharan Africa"
label values worldregion7 wr7   
*mdesc worldregion7		// 0

* Obesity (standard definition) * 
gen obese_st = .
replace obese_st = 1 if bmi >=30
replace obese_st = 0 if bmi <30
replace obese_st = . if mi(bmi)
	
* Obesity: Asian cut-offs for SEA * 
gen obese_Asia = obese_st if worldregion7 !=6
replace obese_Asia = 0 if worldregion7 == 6
replace obese_Asia = 1 if worldregion7 == 6 & bmi >= 28 & bmi !=. 
replace obese_Asia = . if mi(bmi)
	
* GLP-1 Eligibility (without Asian cut-offs) *			
gen glp1_elig_noAs = 0
replace glp1_elig_noAs = 1 if bmi >= 30 & bmi !=. 
replace glp1_elig_noAs = 1 if bmi >= 27 & bmi !=. & (clin_dia_fx == 1 | clin_hypt_fx == 1)
replace glp1_elig_noAs = . if mi(bmi) | mi(clin_dia_fx) | mi(clin_hypt_fx)

* GLP-1 Eligibility: Asian cut-offs for SEA *
gen glp1_elig_Asia = glp1_elig_noAs if worldregion7 !=6
replace glp1_elig_Asia = 0 if worldregion7 == 6
replace glp1_elig_Asia = 1 if bmi >= 28 & bmi !=. & worldregion7 == 6
replace glp1_elig_Asia = 1 if bmi >= 24 & bmi !=. & (clin_dia_fx == 1 | clin_hypt_fx == 1) & worldregion7 == 6
replace glp1_elig_Asia = . if mi(bmi) | mi(clin_dia_fx) | mi(clin_hypt_fx)

* Variable: Share of GLP-1 eligible individuals that are identified due to comorbidity		// Note: Does not account for people with BMI ≥30 (28) and comorbidity 
gen glp1_comorb = .
replace glp1_comorb = 0 if glp1_elig_Asia == 1
replace glp1_comorb = 1 if bmi >= 27 & bmi <30 & bmi !=. & (clin_dia_fx == 1 | clin_hypt_fx == 1) & worldregion7 !=6
replace glp1_comorb = 1 if bmi >= 24 & bmi <28 & bmi !=. & (clin_dia_fx == 1 | clin_hypt_fx == 1) & worldregion7 ==6
replace glp1_comorb = . if mi(bmi) | mi(clin_dia_fx) | mi(clin_hypt_fx)
	
* Age categories 
gen age_cat = .
replace age_cat = 1 if age >=25 & age <35
replace age_cat = 2 if age >=35 & age <45
replace age_cat = 3 if age >=45 & age <55
replace age_cat = 4 if age >=55 & age <65
label define agct 1 "25-34" 2 "35-44" 3 "45-54" 4 "55-64" 
label values age_cat agct

* Age categories (for WHO standard distribution) 
egen agecat_pre = cut(age), at(25, 30, 35, 40, 45, 50, 55, 60, 65)
egen agecat_std = group(agecat_pre)
*bys agecat_std: sum age
label define std 1 "25-29" 2 "30-34" 3 "35-39" 4 "40-44" 5 "45-49" 6 "50-54" 7 "55-59" 8 "60-64" 
label values agecat_std std

* Generate countrycat
egen countrycat = group (country)

* Numeric year variable 			// If more than 1 year: Calculate middle and round down
gen year_num = year
replace year_num = "2009" if year_num == "2008-2011"
replace year_num = "2009" if year_num == "2008-2010"
replace year_num = "2012" if year_num == "2011-2013"
replace year_num = "2013" if year_num == "2013-2014"
replace year_num = "2014" if year_num == "2013-2015"
replace year_num = "2014" if year_num == "2014-2015"
replace year_num = "2015" if year_num == "2014 - 2017"
replace year_num = "2015" if year_num == "2014-2016"
replace year_num = "2017" if year_num == "2015 - Mar2020"
replace year_num = "2015" if year_num == "2015-2016"
replace year_num = "2016" if year_num == "2015-2017"
replace year_num = "2016" if year_num == "2016-2017"
replace year_num = "2017" if year_num == "2017-2018"
replace year_num = "2018" if year_num == "2018-2019"
replace year_num = "2019" if year_num == "2019-2020"
replace year_num = "2020" if year_num == "2018-2021"
replace year_num = "2020" if year_num == "2019-2021"
destring year_num, replace

* Indicator for surveys with age limit 25-64		
gen age_25to64 = 1			
forval x = 1/100 {
sum age if countrycat == `x'
replace age_25to64 = 0 if countrycat == `x' & `r(min)' !=25
replace age_25to64 = 0 if countrycat == `x' & `r(max)' !=64
}
mdesc age_25to64					// 0
tab country if age_25to64 == 1		// 12 surveys have age limit between 25-64

* Append standard age weights
merge m:1 agecat_std using "/Users/felixteufel/OneDrive - Emory/DESKTOP/GLP-1 eligibility/Data/Age standard/Age_standard_25-64.dta"
*sum age if age >=25 & age <65 & _merge==1
drop _merge	

* Append UN population size data for 100 included countries *																			
* By country
merge m:1 country using "/Users/felixteufel/OneDrive - Emory/DESKTOP/GLP-1 eligibility/Data/World Population Prospects/Collated/BY COUNTRY/Pop_2020_10Feb25.dta"
*mdesc Population2020 	// 0
keep if _merge==3		// drops pop size estimates for Bahamas, Cayman Islands, French Polynesia		
drop _merge	
replace Population2020_age25to64 = Population2020_age25to64*1000
replace Population2020 = Population2020*1000

* By country, sex, and age group
merge m:1 country using "/Users/felixteufel/OneDrive - Emory/DESKTOP/GLP-1 eligibility/Data/World Population Prospects/Collated/BY COUNTRY SEX AGE/Pop_2020_strat_15May25.dta"
*mdesc Population2020_* 	// 0
keep if _merge==3		// drops pop size estimates for Bahamas, Cayman Islands, French Polynesia		
drop _merge

gen Population2020_strat = .
foreach x of num 1/8 {
replace Population2020_strat = Population2020_male_`x' if sex == 0 & agecat_std == `x'
replace Population2020_strat = Population2020_fem_`x' if sex == 1 & agecat_std == `x' 
}
mdesc Population2020_strat if age >=25 & age <=64 & !mi(sex) // 0
drop Population2020_male_* Population2020_fem_*

* Append response rates																													
merge m:1 country using "/Users/felixteufel/OneDrive - Emory/DESKTOP/GLP-1 eligibility/Data/Response rates/Data import.dta"		
keep if _merge==3		// drops response rates of countries not in dataset
drop _merge

*** Save full data set ***
save "$data/glp1_full.dta", replace



********************************************************************************
*** 				  SUBSET DATA & IMPLEMENT WEIGHTING		   				 ***			
********************************************************************************
	
** SUB-SET DATA **	

use "$data/glp1_full.dta", clear

* Full analytic sample
gen analytic_pop_all = 1									
replace analytic_pop_all = 0 if age <25 | (age >64 & !mi(age))
replace analytic_pop_all = 0 if mi(age) | mi(sex)
replace analytic_pop_all = 0 if pregnant ==1 
replace analytic_pop_all = 0 if mi(clin_dia_fx)		// combines biomarker and diagnosis
replace analytic_pop_all = 0 if mi(bmi)
replace analytic_pop_all = 0 if mi(clin_hypt_fx)	// combines biomarker and diagnosis
replace analytic_pop_all = 0 if mi(psu_num)
replace analytic_pop_all = 0 if w3 == 0 

mdesc analytic_pop_all 			// missingness should be 0 ==> Verified
tab analytic_pop_all			// should be = N3 ==> Verified


** IMPLEMENT SAMPLING WEIGHTS **																				

* Address exceptions *
replace w3 = w2 if country == "USA" 		// w3 is wtsaf4yr (50% fasting sub-sample), whereas w2 is wtmec2yr (all examinations incl. A1c)

* Generate weight for full analytic sample *		
generate w3_analytic= w3 if analytic_pop_all == 1			

* By country: Impute missing values of weights 
mdesc w3_analytic if analytic_pop_all == 1	 						// Missing for 793
bys country: egen double w3_analytic_mean = mean(w3_analytic) if w3_analytic!=0 	// Note: No observation actually has w3_analytic_all==0 
replace w3_analytic = w3_analytic_mean if w3_analytic ==. & analytic_pop_all == 1
mdesc w3_analytic if analytic_pop_all == 1		// 0

* Weighting each country equally
tab w3_analytic if w3_analytic <0.000001 // no obs with weight = 0
bys country: egen double all_obs = sum(w3_analytic) if w3_analytic !=. 
generate weq_w3_analytic=.
replace weq_w3_analytic = w3_analytic / all_obs				
/*
sum countrycat
forvalues x = 1/`r(max)' { 
sum weq_w3_analytic if countrycat == `x'	
display r(sum)										// should sum to 1 for each country --> verified 
}
*/

* Weighting each country according to its population size in 2020 *		

* Accounting for changing age-sex structure

bys country agecat_std sex: egen double all_obs_strat = total(w3_analytic) if w3_analytic != .
generate wpop_w3_analytic =.
replace wpop_w3_analytic = w3_analytic*Population2020_strat/all_obs_strat 	
/*
sum countrycat
forvalues x = 1/`r(max)' { 
sum wpop_w3_analytic if countrycat == `x'	
display r(sum)	
tab	Population2020_age25to64 if countrycat == `x'	 
}
*/
// r(sum) and Population2020_age25to64 should have same value for each country (implies that we still weight countries according to overall popsize) --> Verified
// Exception: Countries that don't have observations across entire 25-64 age range

* Uniform weights (ages 25-64)							
generate wpop_w3_analytic_uni =.
replace wpop_w3_analytic_uni = w3_analytic*Population2020_age25to64/all_obs 
/*
sum countrycat
forvalues x = 1/`r(max)' { 
sum wpop_w3_analytic_uni if countrycat == `x'	
display r(sum)	
tab	Population2020_age25to64 if countrycat == `x'		// r(sum) and Population2020_age25to64 should have same value for each country --> Verified
}
*/

* Uniform weights (all ages) 
generate wpop_w3_analytic_ageall =.
replace wpop_w3_analytic_ageall = w3_analytic*Population2020/all_obs 
/*
sum countrycat
forvalues x = 1/`r(max)' { 
sum wpop_w3_analytic_ageall if countrycat == `x'	
display r(sum)	
tab	Population2020 if countrycat == `x'					// r(sum) and Population2020 should have same value for each country --> Verified
}
*/

*  Verify completeness *
mdesc weq_w3_analytic if analytic_pop_all == 1					// 0
mdesc wpop_w3_analytic if analytic_pop_all == 1					// 0
mdesc wpop_w3_analytic_uni if analytic_pop_all == 1				// 0
mdesc wpop_w3_analytic_ageall if analytic_pop_all == 1			// 0

** SET SURVEY DESIGN **								// "singleunit(centered)" implies that strata with a single sampling unit are centered at the grand mean instead of the stratum mean
gen strata_combined_str = string(countrycat) + "_" + string(stratum_num)
encode strata_combined_str, gen(strata_combined)
svyset psu_num[pw = wpop_w3_analytic], strata(strata_combined) singleunit(centered)  		
*svyset psu_num[pw = wpop_w3_analytic], strata(stratum_num) singleunit(centered)  		

** SAVE **
save "$data/glp1_final.dta", replace
	
